In [1]:
import time
start_time = time.time()

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math
from pandas import Series,DataFrame
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import metrics
import seaborn as sns
pd.options.display.max_columns = 500
pd.options.display.max_rows = 50
In [2]:
# Import the CSV
df = pd.read_csv("/Users/datasci/Python/CiscoAML/old/dataset-comparison/NIDS-AML-Baselines/NIDS-AML-Baselines-queen.csv", header=0)
/Users/datasci/Python/CiscoAML/datasci/lib/python3.6/site-packages/IPython/core/interactiveshell.py:3020: DtypeWarning: Columns (20,21) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
In [3]:
# What does this dataframe look like?
df.head()
Out[3]:
Flow ID Src IP Src Port Dst IP Dst Port Protocol Timestamp Flow Duration Total Fwd Packet Total Bwd packets Total Length of Fwd Packet Total Length of Bwd Packet Fwd Packet Length Max Fwd Packet Length Min Fwd Packet Length Mean Fwd Packet Length Std Bwd Packet Length Max Bwd Packet Length Min Bwd Packet Length Mean Bwd Packet Length Std Flow Bytes/s Flow Packets/s Flow IAT Mean Flow IAT Std Flow IAT Max Flow IAT Min Fwd IAT Total Fwd IAT Mean Fwd IAT Std Fwd IAT Max Fwd IAT Min Bwd IAT Total Bwd IAT Mean Bwd IAT Std Bwd IAT Max Bwd IAT Min Fwd PSH Flags Bwd PSH Flags Fwd URG Flags Bwd URG Flags Fwd Header Length Bwd Header Length Fwd Packets/s Bwd Packets/s Packet Length Min Packet Length Max Packet Length Mean Packet Length Std Packet Length Variance FIN Flag Count SYN Flag Count RST Flag Count PSH Flag Count ACK Flag Count URG Flag Count CWE Flag Count ECE Flag Count Down/Up Ratio Average Packet Size Fwd Segment Size Avg Bwd Segment Size Avg Fwd Bytes/Bulk Avg Fwd Packet/Bulk Avg Fwd Bulk Rate Avg Bwd Bytes/Bulk Avg Bwd Packet/Bulk Avg Bwd Bulk Rate Avg Subflow Fwd Packets Subflow Fwd Bytes Subflow Bwd Packets Subflow Bwd Bytes FWD Init Win Bytes Bwd Init Win Bytes Fwd Act Data Pkts Fwd Seg Size Min Active Mean Active Std Active Max Active Min Idle Mean Idle Std Idle Max Idle Min Label
0 192.168.1.105-64.4.54.254-61890-443-6 192.168.1.105 61890 64.4.54.254 443 6 22/04/2019 08:01:12 AM 4 2 0 6.0 0.0 6.0 0.0 3.0 4.242641 0.0 0.0 0.0 0.0 1500000.0 500000.0 4.0 0.0 4.0 4.0 4.0 4.0 0.0 4.0 4.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 40 0 500000.0 0.0 0.0 6.0 2.0 3.464102 12.0 2 0 0 0 2 0 0 0 0.0 3.0 3.0 0.0 0 0 0 0 0 0 1 3 0 0 253 0 1 20 0 0 0 0 0.0 0.0 0.0 0.0 NeedManualLabel
1 64.4.54.254-192.168.1.105-443-61890-6 64.4.54.254 443 192.168.1.105 61890 6 22/04/2019 08:01:12 AM 4 2 0 6.0 0.0 6.0 0.0 3.0 4.242641 0.0 0.0 0.0 0.0 1500000.0 500000.0 4.0 0.0 4.0 4.0 4.0 4.0 0.0 4.0 4.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 40 0 500000.0 0.0 0.0 6.0 2.0 3.464102 12.0 2 0 0 0 2 0 0 0 0.0 3.0 3.0 0.0 0 0 0 0 0 0 1 3 0 0 1024 0 1 20 0 0 0 0 0.0 0.0 0.0 0.0 NeedManualLabel
2 192.168.1.7-192.168.1.9-49666-30692-6 192.168.1.7 49666 192.168.1.9 30692 6 22/04/2019 08:01:16 AM 0 1 1 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 NaN Infinity 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 20 20 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 1 0 0 0 2 0 0 0 1.0 0.0 0.0 0.0 0 0 0 0 0 0 0 0 0 0 2051 2050 0 20 0 0 0 0 0.0 0.0 0.0 0.0 NeedManualLabel
3 192.168.1.7-192.168.1.9-49666-30692-6 192.168.1.7 49666 192.168.1.9 30692 6 22/04/2019 08:01:16 AM 1 1 1 6.0 6.0 6.0 6.0 6.0 0.000000 6.0 6.0 6.0 0.0 1.2E7 2000000.0 1.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 20 20 1000000.0 1000000.0 6.0 6.0 6.0 0.000000 0.0 1 0 0 0 2 0 0 0 1.0 9.0 6.0 6.0 0 0 0 0 0 0 0 3 0 3 2051 2050 0 20 0 0 0 0 0.0 0.0 0.0 0.0 NeedManualLabel
4 192.168.1.7-192.168.1.9-49666-30692-6 192.168.1.7 49666 192.168.1.9 30692 6 22/04/2019 08:01:16 AM 1 2 0 6.0 0.0 6.0 0.0 3.0 4.242641 0.0 0.0 0.0 0.0 6000000.0 2000000.0 1.0 0.0 1.0 1.0 1.0 1.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 40 0 2000000.0 0.0 0.0 6.0 2.0 3.464102 12.0 2 0 0 0 2 0 0 0 0.0 3.0 3.0 0.0 0 0 0 0 0 0 1 3 0 0 2051 0 1 20 0 0 0 0 0.0 0.0 0.0 0.0 NeedManualLabel

Attack Characterization

These are baselines samples. They are thus all labelled benign.

In [4]:
df['Label'] = 0
In [5]:
df.head()
Out[5]:
Flow ID Src IP Src Port Dst IP Dst Port Protocol Timestamp Flow Duration Total Fwd Packet Total Bwd packets Total Length of Fwd Packet Total Length of Bwd Packet Fwd Packet Length Max Fwd Packet Length Min Fwd Packet Length Mean Fwd Packet Length Std Bwd Packet Length Max Bwd Packet Length Min Bwd Packet Length Mean Bwd Packet Length Std Flow Bytes/s Flow Packets/s Flow IAT Mean Flow IAT Std Flow IAT Max Flow IAT Min Fwd IAT Total Fwd IAT Mean Fwd IAT Std Fwd IAT Max Fwd IAT Min Bwd IAT Total Bwd IAT Mean Bwd IAT Std Bwd IAT Max Bwd IAT Min Fwd PSH Flags Bwd PSH Flags Fwd URG Flags Bwd URG Flags Fwd Header Length Bwd Header Length Fwd Packets/s Bwd Packets/s Packet Length Min Packet Length Max Packet Length Mean Packet Length Std Packet Length Variance FIN Flag Count SYN Flag Count RST Flag Count PSH Flag Count ACK Flag Count URG Flag Count CWE Flag Count ECE Flag Count Down/Up Ratio Average Packet Size Fwd Segment Size Avg Bwd Segment Size Avg Fwd Bytes/Bulk Avg Fwd Packet/Bulk Avg Fwd Bulk Rate Avg Bwd Bytes/Bulk Avg Bwd Packet/Bulk Avg Bwd Bulk Rate Avg Subflow Fwd Packets Subflow Fwd Bytes Subflow Bwd Packets Subflow Bwd Bytes FWD Init Win Bytes Bwd Init Win Bytes Fwd Act Data Pkts Fwd Seg Size Min Active Mean Active Std Active Max Active Min Idle Mean Idle Std Idle Max Idle Min Label
0 192.168.1.105-64.4.54.254-61890-443-6 192.168.1.105 61890 64.4.54.254 443 6 22/04/2019 08:01:12 AM 4 2 0 6.0 0.0 6.0 0.0 3.0 4.242641 0.0 0.0 0.0 0.0 1500000.0 500000.0 4.0 0.0 4.0 4.0 4.0 4.0 0.0 4.0 4.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 40 0 500000.0 0.0 0.0 6.0 2.0 3.464102 12.0 2 0 0 0 2 0 0 0 0.0 3.0 3.0 0.0 0 0 0 0 0 0 1 3 0 0 253 0 1 20 0 0 0 0 0.0 0.0 0.0 0.0 0
1 64.4.54.254-192.168.1.105-443-61890-6 64.4.54.254 443 192.168.1.105 61890 6 22/04/2019 08:01:12 AM 4 2 0 6.0 0.0 6.0 0.0 3.0 4.242641 0.0 0.0 0.0 0.0 1500000.0 500000.0 4.0 0.0 4.0 4.0 4.0 4.0 0.0 4.0 4.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 40 0 500000.0 0.0 0.0 6.0 2.0 3.464102 12.0 2 0 0 0 2 0 0 0 0.0 3.0 3.0 0.0 0 0 0 0 0 0 1 3 0 0 1024 0 1 20 0 0 0 0 0.0 0.0 0.0 0.0 0
2 192.168.1.7-192.168.1.9-49666-30692-6 192.168.1.7 49666 192.168.1.9 30692 6 22/04/2019 08:01:16 AM 0 1 1 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 NaN Infinity 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 20 20 0.0 0.0 0.0 0.0 0.0 0.000000 0.0 1 0 0 0 2 0 0 0 1.0 0.0 0.0 0.0 0 0 0 0 0 0 0 0 0 0 2051 2050 0 20 0 0 0 0 0.0 0.0 0.0 0.0 0
3 192.168.1.7-192.168.1.9-49666-30692-6 192.168.1.7 49666 192.168.1.9 30692 6 22/04/2019 08:01:16 AM 1 1 1 6.0 6.0 6.0 6.0 6.0 0.000000 6.0 6.0 6.0 0.0 1.2E7 2000000.0 1.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 20 20 1000000.0 1000000.0 6.0 6.0 6.0 0.000000 0.0 1 0 0 0 2 0 0 0 1.0 9.0 6.0 6.0 0 0 0 0 0 0 0 3 0 3 2051 2050 0 20 0 0 0 0 0.0 0.0 0.0 0.0 0
4 192.168.1.7-192.168.1.9-49666-30692-6 192.168.1.7 49666 192.168.1.9 30692 6 22/04/2019 08:01:16 AM 1 2 0 6.0 0.0 6.0 0.0 3.0 4.242641 0.0 0.0 0.0 0.0 6000000.0 2000000.0 1.0 0.0 1.0 1.0 1.0 1.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 40 0 2000000.0 0.0 0.0 6.0 2.0 3.464102 12.0 2 0 0 0 2 0 0 0 0.0 3.0 3.0 0.0 0 0 0 0 0 0 1 3 0 0 2051 0 1 20 0 0 0 0 0.0 0.0 0.0 0.0 0

Data Exploration

General data exploration of the sample

In [16]:
# Plot the amount of traffic from Source IP addresses

plt.figure(figsize=(200,10))
plot = sns.countplot(x="Src IP",data=df)
plot.set_xticklabels(plot.get_xticklabels(), rotation=40, ha="right", fontsize=40)
plt.tight_layout()
plt.show()
In [18]:
# Plot the amount of traffic from Destination IP addresses

plt.figure(figsize=(240,15))
plot = sns.countplot(x="Dst IP",data=df)
plot.set_xticklabels(plot.get_xticklabels(), rotation=40, ha="right", fontsize=40)
plt.tight_layout()
plt.show()
In [8]:
# Plot the amount of traffic over Protocols
plt.figure(figsize=(10,4))
plot = sns.countplot(x="Protocol",data=df)
plot.set_xticklabels(plot.get_xticklabels(), rotation=40, ha="right")
plt.tight_layout()
plt.show()
In [9]:
sp_df = df['Src Port'].copy()
values = sp_df.value_counts(sort=True)
values_df = pd.DataFrame(values)
values_df = values_df.reset_index()
values_df.columns = ['port','count']
values_df
Out[9]:
port count
0 49666 48382
1 80 37198
2 135 18813
3 88 14390
4 5985 14222
5 49667 12449
6 389 11283
7 445 11260
8 0 7102
9 49675 7028
10 138 6568
11 123 5840
12 443 4621
13 137 4014
14 68 2538
15 139 2519
16 25 1178
17 3268 1155
18 64664 1002
19 61916 983
20 49669 862
21 56493 843
22 49671 840
23 51267 789
24 58560 786
... ... ...
50641 11538 1
50642 32534 1
50643 7956 1
50644 38667 1
50645 9501 1
50646 27412 1
50647 21267 1
50648 42536 1
50649 41741 1
50650 10524 1
50651 36397 1
50652 24853 1
50653 26900 1
50654 22070 1
50655 18704 1
50656 18192 1
50657 9500 1
50658 27413 1
50659 45838 1
50660 43789 1
50661 22582 1
50662 14394 1
50663 24348 1
50664 32536 1
50665 8188 1

50666 rows × 2 columns

In [10]:
i = sns.set(style='ticks')
i = sns.relplot(x="port", y="count", data=values_df, aspect=3)
i.set_axis_labels('source port').set(xticks=[1024, 49000, 65535])
plt.show(i)
In [11]:
dp_df = df['Dst Port'].copy()
values = dp_df.value_counts(sort=True)
values_df = pd.DataFrame(values)
values_df = values_df.reset_index()
values_df.columns = ['port','count']
values_df
Out[11]:
port count
0 389 184585
1 88 133641
2 53 119471
3 80 75112
4 49666 69799
5 123 56166
6 445 33793
7 135 30982
8 5985 30699
9 3268 23624
10 49667 18899
11 443 18527
12 49675 9644
13 0 7102
14 138 6568
15 139 4747
16 5355 4402
17 137 4014
18 49669 3091
19 67 2538
20 25 1609
21 49671 1600
22 56494 659
23 50493 594
24 51139 590
... ... ...
34653 14661 1
34654 26445 1
34655 15174 1
34656 47958 1
34657 47765 1
34658 23370 1
34659 35664 1
34660 37715 1
34661 43860 1
34662 41813 1
34663 14981 1
34664 12932 1
34665 6785 1
34666 9541 1
34667 8002 1
34668 25933 1
34669 30031 1
34670 40274 1
34671 47253 1
34672 30861 1
34673 18571 1
34674 10375 1
34675 48470 1
34676 12420 1
34677 8188 1

34678 rows × 2 columns

In [12]:
less_than_1024 = values_df[values_df.port < 1023]
less_than_1024
j = sns.set(style='ticks')
j = sns.relplot(x="port", y="count", data=less_than_1024, aspect=3)
#j.set_axis_labels('port').set(xticks=[0,1024])
plt.show(j)
In [13]:
print("--- %s seconds ---" % (time.time() - start_time))
--- 44.64839291572571 seconds ---
In [14]:
# Save the csv with the labels
df.to_csv("NIDS-AML-Baselines-queen-labeled.csv", encoding='utf-8', index=False)
In [ ]: